from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma

# 加载文档
loader = PyPDFLoader("Java开发手册(黄山版).pdf")
documents = loader.load()

# 分割文档
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,  # 每个块的大小
    chunk_overlap=200  # 块之间的重叠
)
docs = text_splitter.split_documents(documents)

# 初始化嵌入模型
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# 将文档嵌入并存储到Chroma向量数据库
vectorstore = Chroma.from_documents(documents=docs, embedding=embeddings, persist_directory="./chroma_db")

# 持久化向量数据库（以便后续使用）
vectorstore.persist()